In [1]:
# This was developed on Google Colab
%env HV_DOC_HTML=true
env: HV_DOC_HTML=true
In [2]:
# pip install -q holoviews hvplot
In [3]:
# Import required libraries and dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
In [4]:
# Load the data into a Pandas DataFrame
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id")

# Display sample data
df_market_data.head(10)
Out[4]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
coin_id
bitcoin 1.08388 7.60278 6.57509 7.67258 -3.25185 83.51840 37.51761
ethereum 0.22392 10.38134 4.80849 0.13169 -12.88890 186.77418 101.96023
tether -0.21173 0.04935 0.00640 -0.04237 0.28037 -0.00542 0.01954
ripple -0.37819 -0.60926 2.24984 0.23455 -17.55245 39.53888 -16.60193
bitcoin-cash 2.90585 17.09717 14.75334 15.74903 -13.71793 21.66042 14.49384
binancecoin 2.10423 12.85511 6.80688 0.05865 36.33486 155.61937 69.69195
chainlink -0.23935 20.69459 9.30098 -11.21747 -43.69522 403.22917 325.13186
cardano 0.00322 13.99302 5.55476 10.10553 -22.84776 264.51418 156.09756
litecoin -0.06341 6.60221 7.28931 1.21662 -17.23960 27.49919 -12.66408
bitcoin-cash-sv 0.92530 3.29641 -1.86656 2.88926 -24.87434 7.42562 93.73082
In [5]:
# Generate summary statistics
df_market_data.describe()
Out[5]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
count 41.000000 41.000000 41.000000 41.000000 41.000000 41.000000 41.000000
mean -0.269686 4.497147 0.185787 1.545693 -0.094119 236.537432 347.667956
std 2.694793 6.375218 8.376939 26.344218 47.365803 435.225304 1247.842884
min -13.527860 -6.094560 -18.158900 -34.705480 -44.822480 -0.392100 -17.567530
25% -0.608970 0.047260 -5.026620 -10.438470 -25.907990 21.660420 0.406170
50% -0.063410 3.296410 0.109740 -0.042370 -7.544550 83.905200 69.691950
75% 0.612090 7.602780 5.510740 4.578130 0.657260 216.177610 168.372510
max 4.840330 20.694590 24.239190 140.795700 223.064370 2227.927820 7852.089700
In [6]:
# Plot your data to see what's in your DataFrame
hvplot.extension('bokeh')
df_market_data.hvplot.line(
    width=800,
    height=400,
    rot=90
)
Out[6]:

Prepare the Data¶

In [7]:
# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file
# Create a DataFrame with the scaled data and set the coinid column as index
df_market_data_scaled = StandardScaler().fit_transform(df_market_data)
df_market_data_scaled = pd.DataFrame(df_market_data_scaled,columns=df_market_data.columns, index=df_market_data.index)

# Display sample data
df_market_data_scaled.head()
Out[7]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
coin_id
bitcoin 0.508529 0.493193 0.772200 0.235460 -0.067495 -0.355953 -0.251637
ethereum 0.185446 0.934445 0.558692 -0.054341 -0.273483 -0.115759 -0.199352
tether 0.021774 -0.706337 -0.021680 -0.061030 0.008005 -0.550247 -0.282061
ripple -0.040764 -0.810928 0.249458 -0.050388 -0.373164 -0.458259 -0.295546
bitcoin-cash 1.193036 2.000959 1.760610 0.545842 -0.291203 -0.499848 -0.270317

Find the Best Value for k Using the Original Data.¶

In [8]:
# Create a list with the number of k-values from 1 to 11
k_values = np.arange(1, 12).tolist()
print(k_values)
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
In [9]:
# Create a function to create inertia values to store in list
def get_inertia(k, df):
    k_model = KMeans(n_clusters=k, random_state=1, n_init='auto')
    k_model.fit(df)
    return k_model.inertia_


# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_scaled`
# 3. Append the model.inertia_ to the inertia list

inertia_values = [get_inertia(k, df_market_data_scaled) for k in k_values]
In [10]:
# Create a DataFrame with the data to plot the Elbow curve
elbow_df = pd.DataFrame({'k': k_values, 'inertia': inertia_values})
elbow_df
Out[10]:
k inertia
0 1 287.000000
1 2 212.123342
2 3 145.897940
3 4 131.457370
4 5 66.317106
5 6 57.402668
6 7 49.212644
7 8 44.799804
8 9 33.859468
9 10 29.250314
10 11 27.187162
In [11]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
hvplot.extension('bokeh')
scaled_elbow = elbow_df.hvplot.line(x='k', y='inertia', xticks=k_values,    width=800,
    height=400, title='Scaled Elbow')
scaled_elbow
Out[11]:

Answer the following question:¶

Question: What is the best value for k?

Answer: k=5


Cluster Cryptocurrencies with K-means Using the Original Data¶

In [12]:
# Initialize the K-Means model using the best value for k
model_scaled = KMeans(n_clusters=5, random_state=1, n_init='auto')
In [13]:
# Fit the K-Means model using the scaled data
model_scaled.fit(df_market_data_scaled)
Out[13]:
KMeans(n_clusters=5, n_init='auto', random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5, n_init='auto', random_state=1)
In [14]:
# Predict the clusters to group the cryptocurrencies using the scaled data
prediction_scaled = model_scaled.predict(df_market_data_scaled)

# Print the resulting array of cluster values.
for i in prediction_scaled:
  print(i)
1
1
2
2
1
1
1
1
1
2
0
2
2
1
2
2
2
2
1
2
0
1
2
2
2
2
2
0
1
2
2
2
3
2
0
0
4
0
2
2
0
In [15]:
# Add a new column to the DataFrame with the predicted clusters
df_market_data_scaled['classification_scaled'] = prediction_scaled

# Display sample data
df_market_data_scaled.head()
Out[15]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y classification_scaled
coin_id
bitcoin 0.508529 0.493193 0.772200 0.235460 -0.067495 -0.355953 -0.251637 1
ethereum 0.185446 0.934445 0.558692 -0.054341 -0.273483 -0.115759 -0.199352 1
tether 0.021774 -0.706337 -0.021680 -0.061030 0.008005 -0.550247 -0.282061 2
ripple -0.040764 -0.810928 0.249458 -0.050388 -0.373164 -0.458259 -0.295546 2
bitcoin-cash 1.193036 2.000959 1.760610 0.545842 -0.291203 -0.499848 -0.270317 1
In [16]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
# hvplot.extension('bokeh')
scaled_scatter = df_market_data_scaled.hvplot.scatter(x='price_change_percentage_24h', y='price_change_percentage_7d', by='classification_scaled', hover_cols='coin_id',    width=800,
    height=400, title='Scaled Scatter')
scaled_scatter
Out[16]:

Optimize Clusters with Principal Component Analysis.¶

In [17]:
# Create a PCA model instance and set `n_components=3`.
pca_model = PCA(n_components=3)
In [18]:
# Use the PCA model with `fit_transform` to reduce to 
# three principal components.
pca = pca_model.fit_transform(df_market_data_scaled.iloc[:, :-1])
pca_df = pd.DataFrame(pca, columns=['PC1','PC2','PC3'], index=df_market_data_scaled.index)
# View the first five rows of the DataFrame.
display(pca_df.head())
PC1 PC2 PC3
coin_id
bitcoin -0.600667 0.842760 0.461595
ethereum -0.458261 0.458466 0.952877
tether -0.433070 -0.168126 -0.641752
ripple -0.471835 -0.222660 -0.479053
bitcoin-cash -1.157800 2.041209 1.859715
In [19]:
# Retrieve the explained variance to determine how much information 
# can be attributed to each principal component.
print(pca_model.explained_variance_ratio_)
print(sum(pca_model.explained_variance_ratio_))
[0.3719856  0.34700813 0.17603793]
0.8950316570309842

Answer the following question:¶

Question: What is the total explained variance of the three principal components?

Answer: 0.895031657030984


Find the Best Value for k Using the PCA Data¶

In [20]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
pca_inertia_values = [get_inertia(k, pca_df) for k in k_values]
# Create a DataFrame with the data to plot the Elbow curve
pca_elbow_df = pd.DataFrame({'k': k_values, 'inertia': pca_inertia_values})
hvplot.extension('bokeh')
pca_elbow = pca_elbow_df.hvplot.line(x='k', y='inertia', xticks=k_values, width=800,
    height=400, title='PCA Elbow')
pca_elbow
Out[20]:

Answer the following questions:¶

  • Question: What is the best value for k when using the PCA data?

    • Answer: 4
  • Question: Does it differ from the best k value found using the original data?

    • Answer: Yes

Cluster Cryptocurrencies with K-means Using the PCA Data¶

In [21]:
# Initialize the K-Means model using the best value for k
model_pca = KMeans(n_clusters=4, random_state=1, n_init='auto')
In [22]:
# Fit the K-Means model using the PCA data
model_pca.fit(df_market_data_scaled.iloc[:,:-1])
Out[22]:
KMeans(n_clusters=4, n_init='auto', random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=4, n_init='auto', random_state=1)
In [23]:
# Predict the clusters to group the cryptocurrencies using the PCA data
prediction_pca = model_pca.predict(df_market_data_scaled.iloc[:,:-1])
# Print the resulting array of cluster values.
for i in prediction_pca:
  print(i)
1
1
2
2
1
1
1
1
2
2
0
2
2
1
2
2
2
2
1
2
0
1
2
2
2
2
2
0
1
2
2
2
3
2
0
0
1
0
2
2
0
In [24]:
# Add a new column to the DataFrame with the predicted clusters
df_market_data_scaled['classification_pca'] = prediction_pca

# Display sample data
df_market_data_scaled.head()
Out[24]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y classification_scaled classification_pca
coin_id
bitcoin 0.508529 0.493193 0.772200 0.235460 -0.067495 -0.355953 -0.251637 1 1
ethereum 0.185446 0.934445 0.558692 -0.054341 -0.273483 -0.115759 -0.199352 1 1
tether 0.021774 -0.706337 -0.021680 -0.061030 0.008005 -0.550247 -0.282061 2 2
ripple -0.040764 -0.810928 0.249458 -0.050388 -0.373164 -0.458259 -0.295546 2 2
bitcoin-cash 1.193036 2.000959 1.760610 0.545842 -0.291203 -0.499848 -0.270317 1 1
In [25]:
# Create a scatter plot using hvPlot by setting 
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`. 
# Color the graph points with the labels found using K-Means and 
# add the crypto name in the `hover_cols` parameter to identify 
# the cryptocurrency represented by each data point.
# hvplot.extension('bokeh')
pca_scatter = df_market_data_scaled.hvplot.scatter(x='price_change_percentage_24h', y='price_change_percentage_7d', by='classification_pca', hover_cols='coin_id',   
    width=800, height=400, title='PCA Scatter')
pca_scatter
Out[25]:

Visualize and Compare the Results¶

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [26]:
# Composite plot to contrast the Elbow curves
# hvplot.extension('bokeh')
scaled_elbow + pca_elbow
Out[26]:
In [27]:
# Composite plot to contrast the clusters
# hvplot.extension('bokeh')
scaled_scatter + pca_scatter
Out[27]:

Answer the following question:¶

  • Question: After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  • Answer: There wasn't much of a difference between the scaled cluster and the PCA cluster other than identifying/classifying a significant outlier. Personally, the PCA Cluster included an extra data point into the second classification vs the scaled method. The PCA method appears to have performed a more concise visual classifcation of the data provided.